Analyze selection data using soluble Ephrin-B2 or -B3¶

In [1]:
# this cell is tagged as parameters for `papermill` parameterization
#input configs
altair_config = None
nipah_config = None

#input files
entropy_file = None
func_scores_E2_file = None
binding_E2_file = None
func_scores_E3_file = None
binding_E3_file = None

#output files
filtered_E2_binding_data = None
filtered_E3_binding_data = None
filtered_E2_binding_low_effect = None
filtered_E3_binding_low_effect = None

#output images
entry_binding_combined_corr_plot = None
entry_binding_combined_corr_plot_agg = None
E2_E3_correlation = None
E2_E3_correlation_site = None
combined_E2_E3_site_corr = None
binding_by_site_plot = None
entry_binding_corr_heatmap = None
binding_corr_heatmap = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entropy_file = "results/entropy/entropy.csv"
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
filtered_E2_binding_low_effect = (
    "results/filtered_data/E2_binding_low_effect_filter.csv"
)
filtered_E3_binding_low_effect = (
    "results/filtered_data/E3_binding_low_effect_filter.csv"
)
entry_binding_combined_corr_plot = (
    "results/images/entry_binding_combined_corr_plot.html"
)
entry_binding_combined_corr_plot_agg = (
    "results/images/entry_binding_combined_corr_plot_agg.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
combined_E2_E3_site_corr = "results/images/combined_E2_E3_site_corr.html"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
entry_binding_corr_heatmap = "results/images/entry_binding_corr_heatmap.html"
binding_corr_heatmap = "results/images/binding_corr_heatmap.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
    pass
    print("Already in correct directory")
else:
    os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
    print("Setup in correct directory")
Setup in correct directory
In [5]:
if nipah_config is None:
##hard paths in case don't want to run with snakemake
    print('loading hard paths')
    altair_config = "data/custom_analyses_data/theme.py"
    nipah_config = "nipah_config.yaml"
    entropy_file = 'results/entropy/entropy.csv'
    
    #input files
    func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
    binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
    func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
    binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"

    filtered_E2_binding_data="results/filtered_data/E2_binding_filtered.csv"
    filtered_E3_binding_data="results/filtered_data/E3_binding_filtered.csv"
    filtered_E2_binding_low_effect="results/filtered_data/E2_binding_low_effect_filter.csv"
    filtered_E3_binding_low_effect="results/filtered_data/E3_binding_low_effect_filter.csv"
In [6]:
print(filtered_E2_binding_data)
results/filtered_data/E2_binding_filtered.csv

Run config files to setup altair theme and config variables¶

In [7]:
if altair_config:
    with open(altair_config, 'r') as file:
        exec(file.read())

with open(nipah_config) as f:
    config = yaml.safe_load(f)

Make the E2/E3 dataframes, filter separately, then merge¶

In [8]:
#import binding and entry data
e2 = pd.read_csv(binding_E2_file)
e2_func = pd.read_csv(func_scores_E2_file)
e3 = pd.read_csv(binding_E3_file)
e3_func = pd.read_csv(func_scores_E3_file)
In [9]:
def merge_func_binding_dfs(func,binding,name):
    df_int = pd.merge(
        binding,
        func,
        on=['site','mutant','wildtype'],
        suffixes=['_binding','_cell_entry'],
        validate='one_to_one',
        how='outer'
    ).round(3)
    df = df_int.rename(columns={'Ephrin binding_mean':'binding_mean','Ephrin binding_std':'binding_std','Ephrin binding_median':'binding_median'})

    # Only save relevant columns
    df = df[['site','wildtype','mutant','binding_median','binding_std','times_seen_binding','effect','effect_std','times_seen_cell_entry','frac_models']]
    
    def filter_binding_data(df):
        df_filter = df[
            (df['mutant'] != '*') &
            (df['mutant'] != '-') &
            (df['site'] != 603) &
            # Filter cell entry parameters
            (df['effect'] >= config['min_func_effect_for_binding']) &
            (df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
            (df['effect_std'] <= config['func_std_cutoff']) &
            # Filter binding parameters
            (df['times_seen_binding'] >= config['min_times_seen_binding']) &
            (df['binding_std'] <= config['max_binding_std']) &
            (df['frac_models'] >= config['frac_models'])
        ]
        return df_filter

    df_filter = filter_binding_data(df)
    
    #For pulling out low effect mutants for heatmaps later. Find mutants below func effect cutoff, but still have ok times_seen and func_std.
    def store_filtered_info(df):
        df_low_filter = df[
            (df['mutant'] != '*') &
            (df['mutant'] != '-') &
            (df['site'] != 603) &
            (df['effect'] < config['min_func_effect_for_binding']) &
            (df['times_seen_cell_entry'] >= config['func_times_seen_cutoff']) &
            (df['effect_std'] <= config['func_std_cutoff']) 
        ]
        return df_low_filter
    
    df_low_effect_filter = store_filtered_info(df)
    
    if name == 'EFNB2':
        print(name)
        df_filter.to_csv(filtered_E2_binding_data,index=False)
        df_low_effect_filter.to_csv(filtered_E2_binding_low_effect,index=False)
    else:
        df_filter.to_csv(filtered_E3_binding_data,index=False)
        df_low_effect_filter.to_csv(filtered_E3_binding_low_effect,index=False)
    
    return df_filter,df_low_effect_filter

#Call filtering function
df_E2_filter,df_E2_filter_missing = merge_func_binding_dfs(e2_func,e2,'EFNB2')
df_E3_filter,df_E3_filter_missing = merge_func_binding_dfs(e3_func,e3,'EFNB3')

#Now that they are filtered, merge EFNB2 and EFNB3
df_binding_effect_merge = pd.merge(
    df_E2_filter,
    df_E3_filter,
    on=['site','wildtype','mutant'],
    suffixes=['_E2','_E3'],
    how='outer'
)

#display stats
display(df_binding_effect_merge.describe().round(3))

# Make a concat df of E2/E3 data for plotting later
df_E2_filter['selection'] = 'EFNB2'
df_E3_filter['selection'] = 'EFNB3'
df_binding_effect_concat = pd.concat([df_E2_filter,df_E3_filter])
EFNB2
site binding_median_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 binding_median_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3
count 7489.000 6971.000 6971.000 6971.000 6971.000 6971.000 6971.000 6971.000 6819.000 6819.000 6819.000 6819.000 6819.000 6819.000 6819.0
mean 342.073 -0.324 0.511 5.919 -0.139 0.375 7.353 0.997 -0.019 0.186 5.720 -0.104 0.400 6.313 1.0
std 148.724 1.110 0.326 2.955 0.599 0.192 4.149 0.027 0.279 0.181 2.841 0.569 0.188 3.316 0.0
min 71.000 -5.284 0.004 2.000 -2.000 0.018 2.000 0.750 -1.960 0.000 2.000 -1.997 0.044 2.000 1.0
25% 215.000 -0.410 0.277 4.000 -0.428 0.229 4.750 1.000 -0.154 0.061 4.000 -0.361 0.260 4.286 1.0
50% 340.000 -0.022 0.435 5.250 0.038 0.343 6.500 1.000 -0.015 0.137 5.000 0.063 0.370 5.571 1.0
75% 468.000 0.206 0.666 7.000 0.318 0.488 8.750 1.000 0.118 0.252 7.000 0.314 0.509 7.286 1.0
max 602.000 2.602 1.989 39.750 0.640 1.000 64.380 1.000 2.008 1.958 40.000 0.660 0.998 49.140 1.0

Make nice interactive plot for correlation between binding and entry for EFNB2 and EFNB3¶

In [10]:
def plot_corr_binding_entry_updated(df,flag):
    variant_selector = alt.selection_point(
        on="mouseover",
        empty=False,
        fields=["site","mutant"],
        value=0
    )  
    variant_selector_agg = alt.selection_point(
        on="mouseover",
        empty=False,
        fields=["site"],
        value=0
    )  
    slider = alt.binding_range(min=2, max=10, step=1, name="times seen")
    selector = alt.param(name="SelectorName", value=2, bind=slider)

    empty_chart = []
    
    for cell in list(df['selection'].unique()):
        tmp_df = df[df['selection'] == cell]
        if flag == True:
            agg_df = tmp_df.groupby('site')[['binding_median','effect']].sum().reset_index()
            chart = alt.Chart(agg_df).mark_point(stroke='black',filled=True).encode(
                x=alt.X('effect', title=f'Median {cell} Cell Entry', axis=alt.Axis(grid=True)),
                y=alt.Y('binding_median', title=f'Summed {cell} Binding', axis=alt.Axis(grid=True)),
                opacity=alt.condition(variant_selector_agg, alt.value(1), alt.value(0.2)),
                size=alt.condition(variant_selector_agg,alt.value(100),alt.value(50)),
                strokeWidth=alt.condition(variant_selector_agg,alt.value(1),alt.value(0)),
                color=alt.condition(variant_selector_agg,alt.value('orange'),alt.value('black')),
                tooltip=['site', 'binding_median','effect'],
            ).properties(
                width=200,
                height=200,
            ).add_params(variant_selector_agg)
            
            empty_chart.append(chart)
        
        
        else:
            chart = alt.Chart(tmp_df).mark_point(stroke='black',filled=True).encode(
                x=alt.X('effect', title=f'{cell} Cell Entry', axis=alt.Axis(grid=True)),
                y=alt.Y('binding_median', title=f'{cell} Binding', axis=alt.Axis(grid=True)),
                opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
                size=alt.condition(variant_selector,alt.value(50),alt.value(20)),
                strokeWidth=alt.condition(variant_selector,alt.value(1),alt.value(0)),
                color=alt.condition(variant_selector,alt.value('orange'),alt.value('black')),
                tooltip=['site', 'wildtype', 'mutant','binding_median','times_seen_binding','effect'],
            ).properties(
                width=200,
                height=200,
            ).add_params(variant_selector,selector).transform_filter(
                alt.datum.times_seen_binding >= selector
            )
            empty_chart.append(chart)
    
    combined_chart = alt.hconcat(*empty_chart,title=alt.Title('Correlation between binding and entry'))
    return combined_chart

entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat,False)
entry_binding_corr_plot.display()
entry_binding_corr_plot.save(entry_binding_combined_corr_plot)

entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat,True)
entry_binding_corr_plot_agg.display()
entry_binding_corr_plot_agg.save(entry_binding_combined_corr_plot_agg)
In [11]:
def plot_entry_binding_corr_heatmap(df):
    empty_chart = []
    
    for cell in list(df['selection'].unique()):
        tmp_df = df[df['selection'] == cell]
        chart = alt.Chart(tmp_df,title=f'{cell}').mark_rect().encode(
            x=alt.X('effect',title='Cell Entry',axis=alt.Axis(values=[-2,-1,0,1])).bin(maxbins=60),
            y=alt.Y('binding_median',title='Binding',axis=alt.Axis(values=[-4,-2,0,2])).bin(maxbins=60),
            color=alt.Color('count()',title='Count').scale(scheme='greenblue'),
            #tooltip=['effect','binding_median']
        )
        empty_chart.append(chart)
    
    combined_chart = alt.hconcat(*empty_chart,title=alt.Title('Correlation between binding and entry')).resolve_scale(y='shared',x='shared',color='shared')
    return combined_chart

entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()
entry_binding_corr_heat.save(entry_binding_corr_heatmap)
In [12]:
def overall_stats(df,effect,name):
    #Find quantiles
    quantile_2 = df['binding_median'].quantile(.02)
    quantile_98 = df['binding_median'].quantile(.98)
    print(f'The 2% quantile for {name} is: {quantile_2}')
    print(f'The 98% quantile for {name} is: {quantile_98}')

    #Now group sites and find intolerant sites 
    filtered_df = df.groupby('site').filter(lambda group: (group[effect] <-0.25).all())
    unique = filtered_df['site'].unique()
    # Convert unique to a Pandas Series
    unique_series = pd.Series(unique)
    #print(unique_series)
    # Find the common elements
    unique_contact_bool = unique_series.isin(config['contact_sites'])
    #print(unique_contact_bool)
    # Filter and get the common elements
    common_elements = unique_series[unique_contact_bool]

    # Print the common elements
    print(f'Here are the contact sites that are conserved: {common_elements}')
    
    print(f'There are {len(unique)} sites with all negative binding score mutants for {name}')
    print(f'These are the sites for {name} with all negative binding score mutants: {list(unique)}')

    #Now find sites with low and high binding (median)
    median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median')
    print(f'For {name}, these are the sites with lowest median binding scores: {median_df.head(5)}')
    median_df = df.groupby('site')['binding_median'].median().reset_index().sort_values(by='binding_median',ascending=False)
    print(f'For {name}, these are the sites with highest median binding scores: {median_df.head(5)}')
    
    #Now calculate mutant number
    total_mutants = df.shape[0]
    upper_cutoff = df[df[effect] > 1].sort_values(by='binding_median',ascending=False)
    median_upper = upper_cutoff['effect'].median()
    print(f'The median entry score for top binders was: {median_upper}')
    
    mutants_above_cutoff_tolerated = upper_cutoff[upper_cutoff['effect'] > 0]
    mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[['site','effect','binding_median','wildtype','mutant']]
    print(f'The mutants with positive entry scores and good binding are: {mutants_above_cutoff_tolerated.head(5)}')
    
    lower_cutoff = df[df[effect] < -1]
    
    print(f'For {name}, there are a total of : {total_mutants} binding mutants')
    print(f'For {name}, there are {upper_cutoff.shape[0]} mutants above cutoff, and {mutants_above_cutoff_tolerated.shape[0]} that have good entry scores')
    print(f'For {name}, there are {lower_cutoff.shape[0]} mutants below cutoff')
 
    total_sites = df['site'].unique().shape[0]
    
    print(f'The total number of sites are: {total_sites}')


overall_stats(df_E2_filter,'binding_median','E2')
overall_stats(df_E3_filter,'binding_median','E3')
The 2% quantile for E2 is: -3.9
The 98% quantile for E2 is: 1.207199999999999
Here are the contact sites that are conserved: 4     240
5     242
10    389
21    490
22    491
27    501
28    504
29    505
31    531
32    532
33    533
34    557
35    579
36    581
39    588
dtype: int64
There are 41 sites with all negative binding score mutants for E2
These are the sites for E2 with all negative binding score mutants: [116, 220, 236, 238, 240, 242, 243, 248, 346, 351, 389, 390, 398, 400, 438, 439, 441, 460, 467, 486, 487, 490, 491, 494, 495, 497, 500, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590]
For E2, these are the sites with lowest median binding scores:      site  binding_median
446   533         -4.0450
409   494         -4.0250
407   491         -3.9815
406   490         -3.8745
404   487         -3.8715
For E2, these are the sites with highest median binding scores:      site  binding_median
33    104          1.4065
48    120          1.3630
132   207          1.2080
129   204          1.1770
49    122          1.1750
The median entry score for top binders was: -0.925
The mutants with positive entry scores and good binding are:       site  effect  binding_median wildtype mutant
1324   139   0.005           1.989        N      Y
5448   354   0.228           1.498        S      T
9533   566   0.081           1.415        F      H
7170   444   0.163           1.256        I      F
1217   134   0.099           1.249        S      I
For E2, there are a total of : 6971 binding mutants
For E2, there are 229 mutants above cutoff, and 22 that have good entry scores
For E2, there are 1040 mutants below cutoff
The total number of sites are: 514
The 2% quantile for E3 is: -0.654
The 98% quantile for E3 is: 0.606
Here are the contact sites that are conserved: 3    389
7    501
8    505
9    531
dtype: int64
There are 10 sites with all negative binding score mutants for E3
These are the sites for E3 with all negative binding score mutants: [108, 243, 352, 389, 486, 494, 497, 501, 505, 531]
For E3, these are the sites with lowest median binding scores:      site  binding_median
417   501         -0.9160
444   531         -0.7235
309   389         -0.6870
492   584         -0.6810
419   505         -0.6660
For E3, these are the sites with highest median binding scores:      site  binding_median
497   589          0.6805
56    129          0.5005
59    132          0.4675
155   231          0.4310
449   537          0.4275
The median entry score for top binders was: -0.764
The mutants with positive entry scores and good binding are:        site  effect  binding_median wildtype mutant
7995    492   0.496           1.200        Q      L
2632    211   0.037           1.149        G      F
10015   598   0.344           1.141        P      G
1655    161   0.234           1.011        S      E
For E3, there are a total of : 6819 binding mutants
For E3, there are 27 mutants above cutoff, and 4 that have good entry scores
For E3, there are 12 mutants below cutoff
The total number of sites are: 511

Find sites with opposite effects on binding¶

In [13]:
#find sites that are different
def find_biggest_differences(df):
    
    df = df[df['site'].isin(config['contact_sites'])]
    efnb2_good_efnb3_bad = df[
        (df['binding_median_E2'] > 0.1) &
        (df['binding_median_E3'] < -0.1)
    ].sort_values(by='binding_median_E2',ascending=False)
    display(efnb2_good_efnb3_bad)
    efnb2_bad_efnb3_good = df[
        (df['binding_median_E2'] < -0.1) &
        (df['binding_median_E3'] > 0.1)
    ].sort_values(by='binding_median_E3',ascending=False)
    display(efnb2_bad_efnb3_good)


find_biggest_differences(df_binding_effect_merge)
site wildtype mutant binding_median_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 binding_median_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3
1965 241 S L 0.867 0.292 6.50 -0.425 0.175 8.625 1.0 -0.136 0.087 7.0 0.158 0.277 6.571 1.0
1960 241 S F 0.686 0.544 7.00 -0.385 0.442 9.375 1.0 -0.110 0.440 6.5 -0.230 0.228 6.857 1.0
1961 241 S G 0.169 0.409 5.25 -0.339 0.686 4.375 1.0 -0.274 0.197 5.0 -0.096 0.226 5.857 1.0
site wildtype mutant binding_median_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 binding_median_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3
5517 492 Q L -0.167 0.523 5.00 0.514 0.344 4.500 1.0 1.200 0.031 5.0 0.496 0.296 4.857 1.0
6787 588 I P -2.239 0.805 4.75 -0.187 0.393 4.500 1.0 1.052 0.055 5.0 -0.637 0.369 5.000 1.0
5520 492 Q R -2.917 0.613 17.75 0.013 0.309 22.750 1.0 0.650 0.076 18.5 0.409 0.225 19.290 1.0
5492 491 S A -0.905 0.480 5.50 -0.593 0.444 5.750 1.0 0.604 0.045 6.0 0.234 0.523 6.143 1.0
4229 402 R M -0.429 0.281 5.00 -0.237 0.615 5.250 1.0 0.307 0.048 5.0 -0.144 0.276 5.857 1.0
6643 580 I E -0.439 0.769 5.00 -0.576 0.750 4.125 1.0 0.255 0.145 4.5 0.523 0.586 5.143 1.0
1938 239 S G -0.133 0.834 2.00 0.145 0.548 2.000 1.0 0.218 0.168 2.0 0.344 0.384 2.000 1.0
6390 559 Q L -0.281 0.619 3.50 -0.777 0.548 6.250 1.0 0.181 0.014 4.0 -0.598 0.734 4.571 1.0
5484 490 Q L -1.473 0.300 7.00 0.419 0.087 7.875 1.0 0.146 0.270 6.5 0.537 0.245 5.857 1.0
4040 388 Q Y -0.152 0.607 5.50 0.502 0.087 5.625 1.0 0.141 0.000 5.5 0.549 0.098 5.143 1.0
4039 388 Q W -0.214 0.220 5.50 0.386 0.246 6.125 1.0 0.117 0.070 5.5 -0.205 0.246 6.429 1.0
1991 242 R W -1.284 0.649 6.50 0.093 0.355 6.875 1.0 0.114 0.182 6.0 -0.441 0.444 6.286 1.0
5677 507 V M -1.357 0.772 6.00 0.419 0.080 7.375 1.0 0.102 0.614 4.0 -0.441 0.506 5.143 1.0

Find correlations between EFNB2 and EFNB3 binding¶

In [14]:
def plot_entry_binding_corr(df):
    chart = alt.Chart(df,title='Correlation Between Ephrin Binding Scores').mark_rect().encode(
        x=alt.X('binding_median_E2',title='EFNB2 binding',axis=alt.Axis(values=[-5,0,2])).bin(maxbins=40),
        y=alt.Y('binding_median_E3',title='EFNB3 binding',axis=alt.Axis(values=[-2,0,2])).bin(maxbins=40),
        color=alt.Color('count()',title='Count').scale(scheme='greenblue'),
    ).properties(
        height=200,
        width=200,
    )
    return chart

entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()
entry_binding_corr_heatmap_1.save(binding_corr_heatmap)
In [15]:
def plot_affinity_solid(df):
    slider = alt.binding_range(min=1, max=20, step=1, name="times_seen")
    selector = alt.param(name="SelectorName", value=1, bind=slider)
    df = df.dropna()
    # calculate r value
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df['binding_median_E2'], df['binding_median_E3'])
    r_value = float(r_value)
    # make chart
    chart = alt.Chart(df,title=alt.Title('Correlation between Mutant Binding Scores',subtitle=f'r={r_value:.2f}')).mark_point(color='black',size=30, opacity=0.2,filled=True).encode(
        x=alt.X('binding_median_E2', title=('EFNB2 Binding')),
        y=alt.Y('binding_median_E3', title=('EFNB3 Binding')),
        tooltip=['site', 'wildtype','mutant','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
    ).properties(
        width=200, 
        height=200
    ).add_params(selector).transform_filter(
            (alt.datum.times_seen_binding_E2 >= selector)
    )
    min = int(df['binding_median_E2'].min())
    max = int(df['binding_median_E3'].max())
    text = alt.Chart({'values':[{'x': min, 'y': max, 'text': f'r = {r_value:.2f}'}]}).mark_text(
        align='left', baseline='top', dx=-10, dy=-20).encode(
            x=alt.X('x:Q'),
            y=alt.Y('y:Q'),
            text='text:N'
        )
    chart_and_text = chart
    return chart_and_text

E2_E3_corr = plot_affinity_solid(df_binding_effect_merge)
E2_E3_corr.display()
E2_E3_corr.save(E2_E3_correlation)

Plot correlations between summary statistics for each site¶

In [16]:
def plot_affinity_solid_mean(df):
    df = df.dropna()
    means = df.groupby('site').agg({
            'effect_E2': 'median',
            'effect_E3': 'median',
            'binding_median_E2': 'median',
            'binding_median_E3': 'median',
            'wildtype': 'first'
        }).reset_index()
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(means['binding_median_E2'], means['binding_median_E3'])
    r_value = float(r_value)
    chart = alt.Chart(means,title=alt.Title('Correlation between Aggregate Mutant Binding Scores',subtitle=f'r={r_value:.2f}')).mark_point(size=50, color='black', opacity=0.3,filled=True).encode(
            x=alt.X('binding_median_E2', title=('Median Ephrin-B2 Binding'), axis=alt.Axis(tickCount=3)),
            y=alt.Y('binding_median_E3', title=('Median Ephrin-B3 Binding'), axis=alt.Axis(tickCount=3)),
            tooltip=['site', 'wildtype','binding_median_E2','binding_median_E3','effect_E2','effect_E3'],
        ).properties(
            width=200, 
            height=200
    )
    text = alt.Chart({'values':[{'x': -3.5, 'y': 0.5, 'text': f'r = {r_value:.2f}'}]}).mark_text(
        align='left', baseline='top', dx=0, dy=-10).encode(
            x=alt.X('x:Q'),
            y=alt.Y('y:Q'),
            text='text:N'
        )
    chart_and_text = chart #+ text
    return chart_and_text#.display()

E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
E2_E3_site_corr.save(E2_E3_correlation_site)

(E2_E3_site_corr | E2_E3_corr).save(combined_E2_E3_site_corr)

Make plot showing binding by site (median)¶

In [17]:
def plot_affinity_by_site_median(df):
    variant_selector = alt.selection_point(
        on="mouseover",
        empty=False,
        fields=["site"],
        value=0
    )  
    empty_charts = []
    for selection in ['binding_median_E2','binding_median_E3']:
        if selection == 'binding_median_E2':
            name = 'EFNB2 Binding'
        else:
            name = 'EFNB3 Binding'
        mean = df.groupby('site')[selection].sum().reset_index()
        chart = alt.Chart(mean).mark_point(size=60, color='black', stroke='black',filled=True).encode(
            x=alt.X('site', title=('Site'), axis=alt.Axis(grid=True, tickCount=4),scale=alt.Scale(domain=[70,602])),
            y=alt.Y(selection, title=(name), axis=alt.Axis(grid=True, tickCount=4)),
            tooltip=['site'],
            color=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),
            opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.4)),
            strokeWidth=alt.condition(variant_selector,alt.value(2),alt.value(0))
        ).properties(
            width=500, 
            height=100
        ).add_params(variant_selector)
        empty_charts.append(chart)
    combined_chart = alt.vconcat(*empty_charts, spacing=1,title='Summed Binding by Site')
    return combined_chart



binding_by_site = plot_affinity_by_site_median(df_binding_effect_merge)
binding_by_site.display()
binding_by_site.save(binding_by_site_plot)

Make bubble plots for binding in different areas of receptor pocket¶

In [18]:
def make_boxplot_binding_region(df,title):# Create a box plot using Altair for aggregated means
    barrel_ranges = {
    'Hydrophobic': config['hydrophobic'],
    'Salt Bridges': config['salt_bridges'],
    'Hydrogen Bonds': config['h_bond_total'],
    'Contact': config['contact_sites'],
    'Overall': list(range(71,602)),
    }
    
    mean_df = df.groupby('site')[['binding_median']].median().reset_index()
    custom_order = ['Hydrophobic','Salt Bridges','Hydrogen Bonds','Contact','Overall']
    agg_means = []
    
    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = mean_df[mean_df['site'].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append({'barrel': barrel, 'effect': row['binding_median'],'site':row['site']})
        agg_means_df = pd.DataFrame(agg_means)
    chart = alt.Chart(agg_means_df).mark_point(filled=True,size=70,opacity=0.4,color='black').encode(
                x=alt.X('barrel:O', sort=custom_order,title=None,axis=alt.Axis(labelAngle=-90)),
                y=alt.Y('effect',title=f'Median {title} Binding',axis=alt.Axis(grid=True,tickCount=4)),
                xOffset='random:Q',
                #color = alt.Color('barrel').legend(None),
                tooltip=['barrel', 'effect','site'],
            ).transform_calculate(
                random="sqrt(-1*log(random()))*cos(2*PI*random())"
        
            ).properties(
                height=alt.Step(20),
                width=alt.Step(25)
            )
    
    return chart.display()

make_boxplot_binding_region(df_E2_filter,'EFNB2')
make_boxplot_binding_region(df_E3_filter,'EFNB3')

Plot histogram¶

In [19]:
def effect_histogram(df):
    colors = {'E2': '#1f4e79', 'E3': '#ff7f0e'}
    all_charts = []
    for effect in ['E2','E3']:
        func_effect_container = []
        for func_effect in [-2]:# Melt the dataframe for the specific columns
            df = df[
                (df[f'effect_{effect}'] > func_effect)
            ]
            color = colors[effect]    
            df_melted = df.melt(value_vars=['binding_median_E2', 'binding_median_E3'], var_name='Effect', value_name='Value')
        
            # Histogram for 'effect_E2'
            histogram = alt.Chart(df_melted[df_melted['Effect'] == f'binding_median_{effect}']).mark_bar(opacity=1,color=color).encode(
                x=alt.X('Value', bin=alt.Bin(step=0.1), title=f'Binding {effect}',axis=alt.Axis(tickCount=4,values=[3,1,0,-1,-5]),scale=alt.Scale(domain=[-6,3])),
                y=alt.Y('count()',stack=None,title='Count'),
                #color=alt.Color('red'),
                tooltip=['Effect', 'count()']
            ).properties(
                width=150, 
                height=alt.Step(10)
            )
            func_effect_container.append(histogram)
        combined_effect_chart = alt.hconcat(*func_effect_container).resolve_scale(y='shared', x='shared', color='independent')
        all_charts.append(combined_effect_chart)
    final_combined_chart = alt.vconcat(*all_charts).resolve_scale(y='independent', x='independent', color='independent')
    return final_combined_chart.display()
    
effect_histogram(df_binding_effect_merge)
#effect_histogram(df_affinity_filter_merge,'#ff7f0e','binding_mean_E3')
In [ ]: